Load data, prepared fpkm values and ortholog annonation
datasets = as.data.frame(scan("Stanford_datasets.txt",list(setname="",seqBatch="",species="",tissue=""),sep="\t"))
rawCounts <- as.matrix(read.table('Stanford_datasets_rawCountsMat.txt',header=FALSE,sep='\t'))
geneDetails <- as.data.frame(scan("ortholog_GC_table.txt",skip=1,list(mouse_name="",mouse_GC = 0.0,human_name = "",human_GC=0.0)))
Set columns names
colnames(rawCounts) <- datasets$setname
rownames(rawCounts) <- geneDetails$human_name
rownames(datasets) <- datasets$setname
rownames(geneDetails) <- geneDetails$human_name
Filter out lower 30% and mitochondrial genes
rowSums = apply(rawCounts,1,function(x) sum(x))
quantile(rowSums,probs = 0.3) # result is 2947.9
## 30%
## 2947.9
filter <- apply(rawCounts,1,function(x) sum(x)>2947.9 )
mt <- grep("mt-",geneDetails$mouse_name)
filteredNames <- setdiff(rownames(rawCounts[filter,]),rownames(rawCounts[mt,]))
filteredRawCounts <- rawCounts[filteredNames,]
Normalize data accounting for GC content
GCnormCounts <- filteredRawCounts
GCnormCounts[,1:13] <- withinLaneNormalization(filteredRawCounts[,1:13],geneDetails[filteredNames,"human_GC"],which="loess",round=TRUE)
GCnormCounts[,14:26] <- withinLaneNormalization(filteredRawCounts[,14:26],geneDetails[filteredNames,"mouse_GC"],which="loess",round=TRUE)
Depth normalize using TMM scaling factors
origColSums <- apply(rawCounts,2,function(x) sum(x))
normFactors <- calcNormFactors(GCnormCounts,method='TMM')
colSums = apply(GCnormCounts,2,function(x) sum(x))
normalizedColSums <- origColSums
i <- 1
while (i<length(colSums)){
normalizedColSums[i] <- origColSums[i]* normFactors[i]
i <- i+1
}
meanDepth <- mean(normalizedColSums)
filteredDepthNormCounts <- GCnormCounts
i <- 1
while (i<ncol(filteredDepthNormCounts)){
filteredDepthNormCounts[,i] <- (GCnormCounts[,i]/normalizedColSums[i])*meanDepth
i <- i+1
}
Apply Harmony method to correct batch effect
meta <- data.frame(seqBatch = datasets$seqBatch,tissue=datasets$tissue,species=datasets$species)
harmonized_pcs <- HarmonyMatrix(
data_mat = logTransformedDepthNormCounts,
meta_data = meta,
vars_use = "seqBatch",
)
## Harmony 1/10
## Harmony converged after 1 iterations
plotData_pca <- prcomp(harmonized_pcs[, -1])
Check PCA statistics
summary(plotData_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 0.3195 0.2753 0.2512 0.23087 0.22044 0.1873 0.16456
## Proportion of Variance 0.1794 0.1333 0.1109 0.09372 0.08544 0.0617 0.04761
## Cumulative Proportion 0.1794 0.3127 0.4236 0.51735 0.60279 0.6645 0.71209
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.15842 0.14771 0.14139 0.13924 0.12729 0.12227 0.1171
## Proportion of Variance 0.04413 0.03836 0.03515 0.03409 0.02849 0.02628 0.0241
## Cumulative Proportion 0.75622 0.79458 0.82973 0.86381 0.89230 0.91858 0.9427
## PC15 PC16 PC17 PC18 PC19
## Standard deviation 0.11384 0.10432 0.06797 0.05103 0.03915
## Proportion of Variance 0.02279 0.01913 0.00812 0.00458 0.00270
## Cumulative Proportion 0.96547 0.98460 0.99273 0.99730 1.00000
Transfer PCA data to plots
plotData = datasets[,c("setname","species","tissue")]
plotData$PC1 <- harmonized_pcs[,1]
plotData$PC2 <- harmonized_pcs[,2]
plotData$PC3 <- harmonized_pcs[,3]
Plot the first and the second principal components

Plot the first and the second principal components with centroids

Plot the first, the second and the third principal components
Test for significance of correlations between the matched tissues PC values of human and mouse
cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="pearson")
##
## Pearson's product-moment correlation
##
## data: plotData$PC1[1:13] and plotData$PC1[14:26]
## t = 1.3797, df = 11, p-value = 0.1951
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2117066 0.7717468
## sample estimates:
## cor
## 0.3840807
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="pearson")
##
## Pearson's product-moment correlation
##
## data: plotData$PC2[1:13] and plotData$PC2[14:26]
## t = 2.1433, df = 11, p-value = 0.05529
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01174541 0.84195285
## sample estimates:
## cor
## 0.5427523
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="pearson")
##
## Pearson's product-moment correlation
##
## data: plotData$PC3[1:13] and plotData$PC3[14:26]
## t = 3.957, df = 11, p-value = 0.002246
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3728563 0.9262503
## sample estimates:
## cor
## 0.7663947